# -*- coding: utf-8 -*-
# @Time     : 2022/6/20 10:36
# @Author   : JustFly
# @File     : pc_httpxAsyncio_parsel.py
# @Software : PyCharm
"""
|~||||||||||王炸 ！！！！|||||||||~|

httpx异步 + parsel 以爬取链家二手房信息为例

# 1.获取总页数
# 2.每一页都分配给一个 asyncio协程 进行爬取和数据解析

加入任务 34....
Write a CSV file to path 浦东_三房_500_800万(httpxAsyncio+parsel).csv Successful.
耗时：3.5270493030548096 s

进程已结束，退出代码为 0

"""


import asyncio
import csv
import json
import re
import time

import httpx
from fake_useragent import UserAgent
from parsel import Selector


class HomeLinkSpider(object):
    """Scrape Lianjia (链家) Pudong second-hand housing listings and save them to CSV.

    Workflow:
      1. ``get_max_page()`` fetches the first listing page and reads the total
         page count from the pagination widget.
      2. ``parse_page()`` fans out one asyncio task per page over a single
         shared ``httpx.AsyncClient``; each task downloads its page and appends
         one dict per listing to ``self.data``.
      3. ``write_csv_file()`` dumps ``self.data`` to ``self.path``.
    """

    # Compiled once at class-definition time instead of once per listing.
    _FLOOR_RE = re.compile(r'\d{1,2}')   # floor count, e.g. "高楼层(共6层)" -> "6"
    _YEAR_RE = re.compile(r'\d{4}')      # build year, e.g. "1999年建" -> "1999"
    _PRICE_RE = re.compile(r'\d+')       # leading digits of a price string

    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()  # parsed listing dicts, filled by parse_single_page
        self.path = "浦东_三房_500_800万(httpxAsyncio+parsel).csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/p5/"

    def get_max_page(self):
        """Fetch the first listing page and return the total page count.

        Returns:
            int | None: ``totalPage`` from the pagination widget, or ``None``
            when the request fails or the pagination markup is missing.
        """
        res = httpx.get(self.url, headers=self.headers)
        if res.status_code != 200:
            print(f'请求失败 status: {res.status_code}')
            return None
        selector = Selector(res.text)
        # The pagination widget keeps its state as JSON in a page-data attribute.
        boxes = selector.css('div[class="page-box house-lst-page-box"]')
        page_data = boxes[0].xpath('//@page-data').get() if boxes else None
        if page_data is None:
            print('未找到分页信息')
            return None
        # json.loads instead of eval(): never execute server-supplied text.
        max_page = json.loads(page_data)['totalPage']
        print(f'最大页码数: {max_page}')
        return max_page

    async def parse_single_page(self, url, client=None):
        """Download one listing page and append its parsed rows to ``self.data``.

        Args:
            url: listing page URL, e.g. ``.../pg3p5/``.
            client: optional shared ``httpx.AsyncClient``. A temporary client
                is created when omitted, so the old one-argument call still works.
        """
        if client is None:
            async with httpx.AsyncClient() as own_client:
                res = await own_client.get(url, headers=self.headers)
        else:
            res = await client.get(url, headers=self.headers)
        self._parse_listing_html(res.text)

    def _parse_listing_html(self, html):
        """Parse one listing-page HTML document into dicts on ``self.data``."""
        selector = Selector(html)
        ul = selector.css('ul.sellListContent')[0]
        for li in ul.css('li'):
            detail = dict()
            detail['title'] = li.css('div.title a::text').get()

            # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
            house_info = li.css('div.houseInfo::text').get()
            if house_info is None:
                # Ad / non-listing <li> entries carry no houseInfo; skip them
                # instead of crashing on None.split().
                continue
            info = house_info.split(" | ")
            # 户型 面积 朝向
            detail['bedroom'] = info[0]
            detail['area'] = info[1]
            detail['direction'] = info[2]
            # 楼层
            floor_match = self._FLOOR_RE.search(info[4]) if len(info) > 4 else None
            detail['floor'] = floor_match.group() if floor_match else "未知"
            # 年份
            year_match = self._YEAR_RE.search(info[5]) if len(info) > 5 else None
            detail['year'] = year_match.group() if year_match else "未知"

            # "文兰小区 - 塘桥": community name and district.
            position_info = li.css('div.positionInfo a::text').getall()
            detail['house'] = position_info[0]
            detail['location'] = position_info[1]

            # Total price, e.g. "650" (万).
            detail['total_price'] = li.css('div.totalPrice span::text').get()

            # Unit price, e.g. "64,5182元/平米" -> "645182".
            unit_price = li.css('div.unitPrice span::text').get() or ""
            unit_match = self._PRICE_RE.search(unit_price.replace(",", ""))
            detail['unit_price'] = unit_match.group() if unit_match else "未知"
            self.data.append(detail)

    async def _crawl(self, max_page):
        """Fan out one task per page, sharing a single connection pool."""
        async with httpx.AsyncClient() as client:
            tasks = []
            for i in range(1, max_page + 1):
                url = f"https://sh.lianjia.com/ershoufang/pudong/pg{i}p5/"
                print(f"加入任务 {i}....")
                tasks.append(self.parse_single_page(url, client))
            await asyncio.gather(*tasks)

    def parse_page(self):
        """Crawl every listing page concurrently and collect results in ``self.data``."""
        max_page = self.get_max_page()
        if not max_page:
            return  # request failed; nothing to crawl (avoids range(None + 1) TypeError)
        # asyncio.run() replaces the deprecated get_event_loop()/run_until_complete;
        # passing bare coroutines to asyncio.wait is an error on Python 3.11+.
        asyncio.run(self._crawl(max_page))

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as an Excel-friendly CSV.

        The ``utf_8_sig`` encoding prepends a BOM so Excel auto-detects UTF-8
        and renders the Chinese headers correctly.
        """
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area", "direction", "floor", "year", "location", "total_price",
                "unit_price"]

        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                for item in self.data:
                    writer.writerow([item[k] for k in keys])
                print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))


if __name__ == '__main__':
    # Crawl all pages, dump the CSV, and report wall-clock time.
    started_at = time.time()

    spider = HomeLinkSpider()
    spider.parse_page()
    spider.write_csv_file()

    print(f"耗时：{time.time() - started_at} s")


